import time
import requests
from bs4 import BeautifulSoup
from openpyxl import workbook
def get_data(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Page URL to request.
        headers: HTTP headers dict (user-agent / Referer) sent with the request.
        timeout: Seconds to wait for the server before giving up. New
            parameter with a default, so existing callers are unaffected.

    Returns:
        The decoded HTML text of the response.
    """
    # Without a timeout, requests.get can block indefinitely on a stalled
    # connection, hanging the whole scraper.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

def parse_data(html_data):
    """Extract title, link, price and details from one Lianjia rental-list
    page, pass each record to save_data(), and echo it to stdout.

    Args:
        html_data: Raw HTML text of a listing page (as returned by get_data).
    """
    soup = BeautifulSoup(html_data, 'lxml')
    headlines = soup.find_all('p', {'class': 'content__list--item--title'})
    prices = soup.find_all('span', {'class': 'content__list--item-price'})
    details = soup.find_all('p', {'class': 'content__list--item--des'})
    # zip() stops at the shortest list, so a page with mismatched counts
    # silently yields fewer records rather than raising.
    for headline, price, detail in zip(headlines, prices, details):
        # Listing title
        h = headline.get_text().strip()
        # The page's href is site-relative; prepend the host to get a full URL
        hl = 'https://hf.lianjia.com' + headline.a['href']
        # Price text
        p = price.get_text()
        # get_text() already returns str (the original str() call was
        # redundant); strip embedded spaces and newlines
        de = detail.get_text().replace(' ', '').replace('\n', '')
        save_data(h, hl, p, de)
        print(h)
        print(hl)
        print(p)
        print(de)
        print('=================')

def save_data(h, hl, p, de):
    """Append one listing record as a worksheet row and persist the workbook.

    Args:
        h: Listing title.
        hl: Absolute URL of the listing.
        p: Price text.
        de: Cleaned details text.

    Note: relies on the module-level ``ws`` and ``wb`` objects created
    under ``__main__``; the workbook is re-saved after every row.
    """
    ws.append([h, hl, p, de])
    wb.save('lianjia.xlsx')

if __name__ == '__main__':
    wb = workbook.Workbook()
    ws = wb.active
    # Spreadsheet header row: title, link, price, details
    ws.append(['标题', '标题链接', '价格', '详情'])
    # Scrape the first 5 listing pages. (The original comment claimed 10
    # pages, which did not match the range(1, 6) loop bounds.)
    for i in range(1, 6):
        print('正在爬取第{}页'.format(i))
        # Pause between requests to avoid hammering the server
        time.sleep(3)
        url = 'https://hf.lianjia.com/zufang/pg{}'.format(i)
        headers = {
            'user-agent': 'Mozilla/5.0 (X11) AppleWebKit/62.41 (KHTML, like Gecko) Edge/17.10859 Safari/452.6',
            'Referer': 'https://hf.lianjia.com/zufang/pg100/',
        }

        html_data = get_data(url, headers)
        parse_data(html_data)